import pandas as pd
from google.colab import drive
# Mount Google Drive into the Colab VM (interactive — prompts for authorization).
# NOTE(review): the mount point is never used below; the dataset is read from
# /content/insurance.csv, not from Drive. Possibly left over from an earlier run.
drive.mount('/content/drive')
Mounted at /content/drive
- Load the Dataset
# Load the dataset
# Expected columns (per the preview below): age, sex, bmi, children, smoker, region, charges.
df = pd.read_csv('/content/insurance.csv')
# Display the first few rows of the dataset
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
- Data Preprocessing
# Import necessary libraries
import pandas as pd
import numpy as np
import joblib # For loading and saving models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Step 1: Load and preprocess the dataset
# Step 1: Load and preprocess the dataset.
df = pd.read_csv('/content/insurance.csv')

# One-hot encode categorical variables (sex, smoker, region); drop_first avoids
# the dummy-variable trap and matches the encoding used elsewhere in the notebook.
df_encoded = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)

# Prepare features / target for modeling.
X = df_encoded.drop('charges', axis=1)
y = df_encoded['charges']

# 70/30 train/test split; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 2: Train and Save Models.
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

# Train each model and persist it; "Random Forest" -> final_random_forest_model.pkl, etc.
# (The notebook export stripped the loop-body indentation; restored here.)
for name, model in models.items():
    model.fit(X_train, y_train)
    joblib.dump(model, f'final_{name.lower().replace(" ", "_")}_model.pkl')
# Step 3: Load and Test the Saved Models
# Load the saved models
# Step 3: Load and Test the Saved Models.
# (The notebook export stripped the try/def indentation; restored here.)
try:
    # Filenames follow the pattern produced by the training loop above.
    loaded_rf_model = joblib.load('final_random_forest_model.pkl')
    loaded_gb_model = joblib.load('final_gradient_boosting_model.pkl')
    loaded_xgb_model = joblib.load('final_xgboost_model.pkl')

    # Score the reloaded models on the same held-out test split used for training.
    test_rf_pred = loaded_rf_model.predict(X_test)
    test_gb_pred = loaded_gb_model.predict(X_test)
    test_xgb_pred = loaded_xgb_model.predict(X_test)

    def evaluate_model(predictions, true_values, model_name):
        """Print MAE, RMSE and R² for one model's predictions on the test set."""
        mae = mean_absolute_error(true_values, predictions)
        rmse = np.sqrt(mean_squared_error(true_values, predictions))
        r2 = r2_score(true_values, predictions)
        print(f"\n{model_name} Performance:")
        print(f"Mean Absolute Error: {mae:.2f}")
        print(f"Root Mean Squared Error: {rmse:.2f}")
        print(f"R-Squared: {r2:.2f}")

    evaluate_model(test_rf_pred, y_test, "Random Forest")
    evaluate_model(test_gb_pred, y_test, "Gradient Boosting")
    evaluate_model(test_xgb_pred, y_test, "XGBoost")
except FileNotFoundError as e:
    # Only missing-file errors are handled; anything else propagates.
    print(f"Error: {e}")
Random Forest Performance: Mean Absolute Error: 2667.15 Root Mean Squared Error: 4657.03 R-Squared: 0.85 Gradient Boosting Performance: Mean Absolute Error: 2489.00 Root Mean Squared Error: 4435.72 R-Squared: 0.87 XGBoost Performance: Mean Absolute Error: 2815.97 Root Mean Squared Error: 4908.25 R-Squared: 0.84
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Step 1: Load the Dataset
dataset_path = '/content/insurance.csv' # Update this path as needed
df = pd.read_csv(dataset_path)
# Step 2: Inspect the Dataset
print(f"Total number of records: {len(df)}")
print(f"Column headers: {df.columns.tolist()}")
print(df.head())
# Step 3: Data Preprocessing
# 3.1 Check for missing values
print("Missing values in each column:\n", df.isnull().sum())
# 3.2 Encode categorical variables
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
# 3.3 Feature Scaling (Normalization)
# NOTE(review): StandardScaler standardizes in place, so every later plot of
# df['age'] / df['bmi'] in this section shows z-scores, not raw values.
scaler = StandardScaler()
features = ['age', 'bmi', 'children']
df[features] = scaler.fit_transform(df[features])
# Step 4: Define Features and Target Variable
X = df.drop('charges', axis=1)
y = df['charges']
# Step 5: Split the Data into Training and Testing Sets
# Same test_size/random_state as the first pipeline, so the split is identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 6: Model Selection and Training
# 6.1 RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
# 6.2 GradientBoostingRegressor
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)
# Step 7: Model Evaluation on Test Data
# 7.1 RandomForestRegressor Predictions
rf_pred = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)
# 7.2 GradientBoostingRegressor Predictions
gb_pred = gb_model.predict(X_test)
gb_mae = mean_absolute_error(y_test, gb_pred)
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_pred))
gb_r2 = r2_score(y_test, gb_pred)
print("\nRandom Forest Metrics:")
print(f"MAE: {rf_mae}")
print(f"RMSE: {rf_rmse}")
print(f"R²: {rf_r2}")
print("\nGradient Boosting Metrics:")
print(f"MAE: {gb_mae}")
print(f"RMSE: {gb_rmse}")
print(f"R²: {gb_r2}")
# Step 8: Cross-Validation for Stability Check
# 10-fold CV over the FULL dataset (not just the training split), so these
# scores are not independent of the test set above.
rf_cv_scores = cross_val_score(rf_model, X, y, cv=10, scoring='r2')
gb_cv_scores = cross_val_score(gb_model, X, y, cv=10, scoring='r2')
print("\nRandom Forest Cross-Validation R² Scores:")
print(rf_cv_scores)
print(f"Mean R²: {rf_cv_scores.mean()}")
print("\nGradient Boosting Cross-Validation R² Scores:")
print(gb_cv_scores)
print(f"Mean R²: {gb_cv_scores.mean()}")
# Step 9: Visualizations
# 9.1 Dataset Analysis
# NOTE(review): df['age'] and df['bmi'] were standardized in Step 3.3, so these
# two histograms show z-scores even though the titles say "Age"/"BMI".
# Age distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['age'], kde=True, bins=20)
plt.title('Age Distribution')
plt.savefig('age_distribution.png')
plt.show()
# BMI distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['bmi'], kde=True, bins=20)
plt.title('BMI Distribution')
plt.savefig('bmi_distribution.png')
plt.show()
# Charges distribution (charges were NOT scaled, so this one is in dollars)
plt.figure(figsize=(10, 6))
sns.histplot(df['charges'], kde=True, bins=20)
plt.title('Charges Distribution')
plt.savefig('charges_distribution.png')
plt.show()
# Correlation matrix over the (now fully numeric, one-hot encoded) frame
plt.figure(figsize=(12, 8))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.show()
# 9.2 Model Performance Visualizations
# Actual vs Predicted Plot for Random Forest — points on the red y=x line are
# perfect predictions.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, rf_pred, alpha=0.6, label='Random Forest')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.title('Actual vs Predicted Charges (Random Forest)')
plt.legend()
plt.savefig('rf_actual_vs_predicted.png')
plt.show()
# Actual vs Predicted Plot for Gradient Boosting
plt.figure(figsize=(10, 6))
plt.scatter(y_test, gb_pred, alpha=0.6, label='Gradient Boosting')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.title('Actual vs Predicted Charges (Gradient Boosting)')
plt.legend()
plt.savefig('gb_actual_vs_predicted.png')
plt.show()
# Residual Plot for Random Forest (actual minus predicted; centered ≈ unbiased)
rf_residuals = y_test - rf_pred
plt.figure(figsize=(10, 6))
sns.histplot(rf_residuals, kde=True)
plt.title('Distribution of Residuals (Random Forest)')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.savefig('rf_residual_distribution.png')
plt.show()
# Residual Plot for Gradient Boosting
gb_residuals = y_test - gb_pred
plt.figure(figsize=(10, 6))
sns.histplot(gb_residuals, kde=True)
plt.title('Distribution of Residuals (Gradient Boosting)')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.savefig('gb_residual_distribution.png')
plt.show()
# 9.3 Learning Curves for both models
# learning_curve re-fits each model at 10 training sizes with 10-fold CV;
# scores come back as negative MSE, hence the sign flip below.
train_sizes, rf_train_scores, rf_test_scores = learning_curve(
rf_model, X, y, cv=10, scoring='neg_mean_squared_error', train_sizes=np.linspace(0.1, 1.0, 10)
)
train_sizes, gb_train_scores, gb_test_scores = learning_curve(
gb_model, X, y, cv=10, scoring='neg_mean_squared_error', train_sizes=np.linspace(0.1, 1.0, 10)
)
# Negate to convert neg-MSE back to MSE for plotting.
rf_train_scores_mean = -np.mean(rf_train_scores, axis=1)
rf_test_scores_mean = -np.mean(rf_test_scores, axis=1)
gb_train_scores_mean = -np.mean(gb_train_scores, axis=1)
gb_test_scores_mean = -np.mean(gb_test_scores, axis=1)
# Learning curve for Random Forest
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, rf_train_scores_mean, 'o-', color='r', label='Training error (RF)')
plt.plot(train_sizes, rf_test_scores_mean, 'o-', color='g', label='Validation error (RF)')
plt.xlabel('Training Size')
plt.ylabel('Error')
plt.title('Learning Curve (Random Forest)')
plt.legend(loc='best')
plt.savefig('rf_learning_curve.png')
plt.show()
# Learning curve for Gradient Boosting
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, gb_train_scores_mean, 'o-', color='r', label='Training error (GB)')
plt.plot(train_sizes, gb_test_scores_mean, 'o-', color='g', label='Validation error (GB)')
plt.xlabel('Training Size')
plt.ylabel('Error')
plt.title('Learning Curve (Gradient Boosting)')
plt.legend(loc='best')
plt.savefig('gb_learning_curve.png')
plt.show()
# Step 10: Save the models trained in THIS section (on scaled features).
import joblib
joblib.dump(rf_model, 'final_rf_model.pkl')
joblib.dump(gb_model, 'final_gb_model.pkl')

# Step 11: Load and Test the Saved Models (Optional)
# BUG FIX: the original loaded 'final_random_forest_model.pkl' /
# 'final_gradient_boosting_model.pkl' / 'final_xgboost_model.pkl' — the FIRST
# pipeline's models, which were trained on *unscaled* features — and applied
# them to this section's scaled X_test. That preprocessing mismatch produced
# the degraded metrics seen in the output (R² of 0.08 / -0.02 / -0.06).
# Load the files saved just above instead, so model and test data agree.
# (No XGBoost model is trained or saved in this section, so none is evaluated.)
try:
    loaded_rf_model = joblib.load('final_rf_model.pkl')
    loaded_gb_model = joblib.load('final_gb_model.pkl')
except FileNotFoundError as e:
    print(f"Error: {e}")
else:
    # Predict on the scaled test split built in Step 5.
    test_rf_pred = loaded_rf_model.predict(X_test)
    test_gb_pred = loaded_gb_model.predict(X_test)

    def evaluate_model(predictions, true_values, model_name):
        """Print MAE, RMSE and R² for one model's test-set predictions."""
        mae = mean_absolute_error(true_values, predictions)
        rmse = np.sqrt(mean_squared_error(true_values, predictions))
        r2 = r2_score(true_values, predictions)
        print(f"\n{model_name} Performance:")
        print(f"Mean Absolute Error: {mae:.2f}")
        print(f"Root Mean Squared Error: {rmse:.2f}")
        print(f"R-Squared: {r2:.2f}")

    evaluate_model(test_rf_pred, y_test, "Random Forest")
    evaluate_model(test_gb_pred, y_test, "Gradient Boosting")
Total number of records: 1338 Column headers: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'] age sex bmi children smoker region charges 0 19 female 27.900 0 yes southwest 16884.92400 1 18 male 33.770 1 no southeast 1725.55230 2 28 male 33.000 3 no southeast 4449.46200 3 33 male 22.705 0 no northwest 21984.47061 4 32 male 28.880 0 no northwest 3866.85520 Missing values in each column: age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64 Random Forest Metrics: MAE: 2664.9715886942795 RMSE: 4634.451593997286 R²: 0.8535158195419218 Gradient Boosting Metrics: MAE: 2490.6412888151885 RMSE: 4438.103389963076 R²: 0.8656651002627456 Random Forest Cross-Validation R² Scores: [0.86423437 0.84087091 0.81698301 0.72623614 0.85542821 0.88821693 0.85690537 0.79614713 0.84282273 0.85853391] Mean R²: 0.8346378711809906 Gradient Boosting Cross-Validation R² Scores: [0.88636658 0.8687801 0.8403614 0.74483395 0.86702461 0.92481761 0.8810778 0.82014416 0.86095864 0.86414137] Mean R²: 0.8558506219727745
Random Forest Performance: Mean Absolute Error: 7604.09 Root Mean Squared Error: 11604.48 R-Squared: 0.08 Gradient Boosting Performance: Mean Absolute Error: 8556.39 Root Mean Squared Error: 12249.33 R-Squared: -0.02 XGBoost Performance: Mean Absolute Error: 8953.14 Root Mean Squared Error: 12489.64 R-Squared: -0.06
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Reload the raw (un-encoded, un-scaled) dataset for exploratory plots.
df = pd.read_csv('/content/insurance.csv')
# 1. Distribution of Charges
plt.figure(figsize=(10, 6))
sns.histplot(df['charges'], kde=True, bins=30)
plt.title('Distribution of Healthcare Costs (Charges)')
plt.xlabel('Charges')
plt.ylabel('Frequency')
plt.savefig('distribution_of_charges.png')
plt.show()
# 2. Charges by Age Group
plt.figure(figsize=(10, 6))
sns.scatterplot(x='age', y='charges', data=df)
plt.title('Healthcare Costs by Age')
plt.xlabel('Age')
plt.ylabel('Charges')
plt.savefig('charges_by_age.png')
plt.show()
# 3. Charges by BMI
plt.figure(figsize=(10, 6))
sns.scatterplot(x='bmi', y='charges', data=df)
plt.title('Healthcare Costs by BMI')
plt.xlabel('BMI')
plt.ylabel('Charges')
plt.savefig('charges_by_bmi.png')
plt.show()
# 4. Charges by Smoking Status
# Check if 'smoker' column exists, else use 'smoker_yes' if it was one-hot encoded.
# (df was reloaded raw just above, so the if-branch is the one taken here.)
# The notebook export stripped the if/else indentation; restored here.
if 'smoker' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='smoker', y='charges', data=df)
    plt.title('Healthcare Costs by Smoking Status')
    plt.xlabel('Smoker')
    plt.ylabel('Charges')
    plt.savefig('charges_by_smoking_status.png')
    plt.show()
else:
    # Fallback for a one-hot encoded frame: rebuild a plottable 'smoker' column.
    df['smoker'] = df['smoker_yes'] if 'smoker_yes' in df.columns else df['smoker_no']
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='smoker', y='charges', data=df)
    plt.title('Healthcare Costs by Smoking Status')
    plt.xlabel('Smoker')
    plt.ylabel('Charges')
    plt.savefig('charges_by_smoking_status.png')
    plt.show()
# 5. Charges by Region
# If region was one-hot encoded, this will plot each region separately.
# (df was reloaded raw above, so the if-branch is taken.)
# The notebook export stripped the if/else indentation; restored here.
if 'region' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='region', y='charges', data=df)
    plt.title('Healthcare Costs by Region')
    plt.xlabel('Region')
    plt.ylabel('Charges')
    plt.savefig('charges_by_region.png')
    plt.show()
else:
    # Fallback for a one-hot encoded frame: plot each region_* dummy separately.
    # NOTE(review): sns.boxplot does not accept label= this way — this branch is
    # dead with the raw frame, but would need rework if ever taken.
    plt.figure(figsize=(10, 6))
    regions = [col for col in df.columns if 'region_' in col]
    for region in regions:
        sns.boxplot(y='charges', data=df[df[region] == 1], label=region)
    plt.title('Healthcare Costs by Region')
    plt.xlabel('Region')
    plt.ylabel('Charges')
    plt.legend()
    plt.savefig('charges_by_region.png')
    plt.show()
# 6. Charges by Number of Children — one box per discrete children count (0-5).
plt.figure(figsize=(10, 6))
sns.boxplot(x='children', y='charges', data=df)
plt.title('Healthcare Costs by Number of Children')
plt.xlabel('Number of Children')
plt.ylabel('Charges')
plt.savefig('charges_by_children.png')
plt.show()
# 7. Interaction Effect: Charges by Age and Smoking Status
# The notebook export stripped the if-body indentation; restored here.
if 'smoker' in df.columns:
    # FIX: lmplot is a figure-level function that creates its own figure; the
    # original's preceding plt.figure() was left empty and showed up in the
    # output as "<Figure size 1000x600 with 0 Axes>". Dropped it so the
    # title/labels/savefig act on the lmplot's own (current) figure.
    sns.lmplot(x='age', y='charges', hue='smoker', data=df, aspect=2, height=6, ci=None, palette='muted')
    plt.title('Interaction Effect: Age and Smoking Status on Healthcare Costs')
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.savefig('interaction_age_smoker.png')
    plt.show()
<Figure size 1000x600 with 0 Axes>
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset and one-hot encode categoricals (sex, smoker, region).
df = pd.read_csv('/content/insurance.csv')
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)

# Check the columns to confirm the correct ones for region.
print("Columns after encoding:", df.columns)

# Six-panel comparative figure (2 rows x 3 columns).
plt.figure(figsize=(16, 12))

# 1. Charges by Age
plt.subplot(2, 3, 1)
sns.scatterplot(x='age', y='charges', data=df)
plt.title('Healthcare Costs by Age')
plt.xlabel('Age')
plt.ylabel('Charges')

# 2. Charges by BMI
plt.subplot(2, 3, 2)
sns.scatterplot(x='bmi', y='charges', data=df)
plt.title('Healthcare Costs by BMI')
plt.xlabel('BMI')
plt.ylabel('Charges')

# 3. Charges by Smoking Status (dummy column after encoding)
plt.subplot(2, 3, 3)
sns.boxplot(x='smoker_yes', y='charges', data=df)
plt.title('Healthcare Costs by Smoking Status')
plt.xlabel('Smoker (1=Yes, 0=No)')
plt.ylabel('Charges')

# 4. Charges by Region
# The notebook export stripped this if/else's indentation; restored here.
plt.subplot(2, 3, 4)
region_columns = [col for col in df.columns if col.startswith('region_')]
if region_columns:
    # Reconstruct a readable 'region' label from the one-hot columns.
    # NOTE(review): with drop_first=True the baseline region (northeast) has
    # all-zero dummies and idxmax picks the first column — baseline rows are
    # mislabeled; confirm whether that matters for this plot.
    df['region'] = df[region_columns].idxmax(axis=1).apply(lambda x: x.split('_')[1].capitalize())
    sns.boxplot(x='region', y='charges', data=df)
    plt.title('Healthcare Costs by Region')
    plt.xlabel('Region')
    plt.ylabel('Charges')
else:
    print("No region columns found. Please check the encoding step.")

# 5. Charges by Number of Children
plt.subplot(2, 3, 5)
sns.boxplot(x='children', y='charges', data=df)
plt.title('Healthcare Costs by Number of Children')
plt.xlabel('Number of Children')
plt.ylabel('Charges')

# 6. Interaction: Age and Smoking Status
plt.subplot(2, 3, 6)
sns.scatterplot(x='age', y='charges', hue='smoker_yes', data=df, palette='muted')
plt.title('Interaction: Age and Smoking Status on Costs')
plt.xlabel('Age')
plt.ylabel('Charges')
plt.legend(title='Smoker (1=Yes, 0=No)', loc='upper left')

# Adjust layout and export the combined figure.
plt.tight_layout()
plt.savefig('comparative_analysis.png')
plt.show()
Columns after encoding: Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes',
'region_northwest', 'region_southeast', 'region_southwest'],
dtype='object')
Convert the notebook code to an .html file
Figure 1
Figure 2
import matplotlib.pyplot as plt
import numpy as np
# Performance metrics for Random Forest (hard-coded from the Step 3 output above)
rf_metrics = {
'Mean Absolute Error (MAE)': 2667.15,
'Root Mean Squared Error (RMSE)': 4657.03,
'R-Squared (R²)': 0.85
}
# Plotting the performance metrics
# NOTE(review): MAE/RMSE are in the thousands while R² ≤ 1, so on this shared
# axis the R² bar is effectively invisible — consider a secondary axis.
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(rf_metrics.keys(), rf_metrics.values(), color=['skyblue', 'salmon', 'lightgreen'])
# Adding text annotations above each bar
for bar in bars:
yval = bar.get_height()
ax.text(bar.get_x() + bar.get_width()/2.0, yval + 50, round(yval, 2), ha='center', va='bottom')
plt.title('Random Forest Regressor Performance')
plt.ylabel('Score')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('figure_2_random_forest_performance.png')
plt.show()
Figure 3: Gradient Boosting Regressor Performance
# Performance metrics for Gradient Boosting (hard-coded from the Step 3 output)
gb_metrics = {
'Mean Absolute Error (MAE)': 2489.00,
'Root Mean Squared Error (RMSE)': 4435.72,
'R-Squared (R²)': 0.87
}
# Plotting the performance metrics (same caveat as Figure 2: R² bar is tiny
# next to the dollar-scale MAE/RMSE bars)
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(gb_metrics.keys(), gb_metrics.values(), color=['skyblue', 'salmon', 'lightgreen'])
# Adding text annotations above each bar
for bar in bars:
yval = bar.get_height()
ax.text(bar.get_x() + bar.get_width()/2.0, yval + 50, round(yval, 2), ha='center', va='bottom')
plt.title('Gradient Boosting Regressor Performance')
plt.ylabel('Score')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('figure_3_gradient_boosting_performance.png')
plt.show()
Figure 4: XGBoost Regressor Performance
# Performance metrics for XGBoost (hard-coded from the Step 3 output)
xgb_metrics = {
'Mean Absolute Error (MAE)': 2815.97,
'Root Mean Squared Error (RMSE)': 4908.25,
'R-Squared (R²)': 0.84
}
# Plotting the performance metrics (same scale caveat as Figures 2-3)
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(xgb_metrics.keys(), xgb_metrics.values(), color=['skyblue', 'salmon', 'lightgreen'])
# Adding text annotations above each bar
for bar in bars:
yval = bar.get_height()
ax.text(bar.get_x() + bar.get_width()/2.0, yval + 50, round(yval, 2), ha='center', va='bottom')
plt.title('XGBoost Regressor Performance')
plt.ylabel('Score')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('figure_4_xgboost_performance.png')
plt.show()
Figure 5: Random Forest Regressor Cross-Validation R² Scores
import matplotlib.pyplot as plt
import seaborn as sns
# R² scores for Random Forest, transcribed (rounded) from the Step 8 CV output
rf_cv_r2_scores = [0.864, 0.841, 0.817, 0.726, 0.855, 0.888, 0.857, 0.796, 0.843, 0.859]
# Box plot summarizing the spread of the 10 cross-validation folds
plt.figure(figsize=(10, 6))
sns.boxplot(y=rf_cv_r2_scores, color='skyblue')
plt.title('Random Forest Regressor Cross-Validation R² Scores')
plt.ylabel('R² Score')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('figure_5_rf_cv_r2_scores.png')
plt.show()
Figure 6: Gradient Boosting Regressor Cross-Validation R² Scores
# R² scores for Gradient Boosting, transcribed (rounded) from the Step 8 CV output
gb_cv_r2_scores = [0.886, 0.869, 0.840, 0.745, 0.867, 0.925, 0.881, 0.820, 0.861, 0.864]
# Box plot summarizing the spread of the 10 cross-validation folds
plt.figure(figsize=(10, 6))
sns.boxplot(y=gb_cv_r2_scores, color='lightgreen')
plt.title('Gradient Boosting Regressor Cross-Validation R² Scores')
plt.ylabel('R² Score')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('figure_6_gb_cv_r2_scores.png')
plt.show()
Figure 4.2.1: Age Distribution
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Reload the raw dataset (un-encoded, un-scaled) for the 4.2.x figure series.
df = pd.read_csv('/content/insurance.csv')
# 4.2.1 Plotting the distribution of age
plt.figure(figsize=(10, 6))
sns.histplot(df['age'], bins=20, kde=True, color='blue')
plt.title('Figure 4.2.1: Distribution of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()
# Figure 4.2.2: BMI Distribution
# FIX: these cells referenced `data`, which is not defined at this point in the
# notebook — only `df` was loaded in the previous cell. Renamed to `df`.
plt.figure(figsize=(10, 6))
sns.histplot(df['bmi'], bins=20, kde=True, color='orange')
plt.title('Figure 4.2.2: Distribution of BMI')
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/content/Figure_4_2_2_BMI_Distribution.png') # Save the figure
plt.show()

# Figure 4.2.3: Healthcare Costs Distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['charges'], bins=20, kde=True, color='green')
plt.title('Figure 4.2.3: Distribution of Healthcare Costs')
plt.xlabel('Charges')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/content/Figure_4_2_3_Healthcare_Costs.png') # Save the figure
plt.show()

# Figure 4.2.4: Correlation Matrix
# Restrict to numeric columns so .corr() does not fail on the object-dtype
# columns (sex, smoker, region) of the raw frame.
numeric_data = df.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(12, 10))
correlation_matrix = numeric_data.corr() # Compute correlation matrix only on numeric columns
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Figure 4.2.4: Correlation Matrix')
plt.xlabel('Features')
plt.ylabel('Features')
plt.grid(True)
plt.savefig('/content/Figure_4_2_4_Correlation_Matrix.png') # Save the figure
plt.show()
# Figure 4.2.5: Random Forest - Actual vs Predicted Charges
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# FIX: replaced the undefined `data` with `df` (loaded in the 4.2.1 cell).
# Preparing the data: features vs target, then one-hot encode categoricals.
X = df.drop('charges', axis=1)
y = df['charges']
X = pd.get_dummies(X)

# 70/30 split with the same seed used throughout the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Random Forest model (default hyperparameters, unseeded).
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

# Plotting actual vs predicted charges; the red dashed y=x line marks perfection.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred_rf, color='purple')
plt.plot([0, max(y_test)], [0, max(y_test)], color='red', linestyle='--')
plt.title('Figure 4.2.5: Actual vs Predicted Charges (Random Forest)')
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_5_Actual_vs_Predicted_RF.png') # Save the figure
plt.show()
Figure 4.2.6: Gradient Boosting - Actual vs Predicted Charges
from sklearn.ensemble import GradientBoostingRegressor
# Train the Gradient Boosting model on the split built in the 4.2.5 cell
# (default hyperparameters, unseeded — results vary slightly between runs).
model_gb = GradientBoostingRegressor()
model_gb.fit(X_train, y_train)
# Predicting
y_pred_gb = model_gb.predict(X_test)
# Plotting actual vs predicted charges; red dashed y=x line marks perfection.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred_gb, color='cyan')
plt.plot([0, max(y_test)], [0, max(y_test)], color='red', linestyle='--')
plt.title('Figure 4.2.6: Actual vs Predicted Charges (Gradient Boosting)')
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_6_Actual_vs_Predicted_GB.png') # Save the figure
plt.show()
Figure 4.2.7: Random Forest - Residual Analysis
# Calculating residuals (actual minus predicted; centered near 0 ≈ unbiased)
residuals_rf = y_test - y_pred_rf
# Plotting residuals distribution
plt.figure(figsize=(10, 6))
sns.histplot(residuals_rf, bins=20, kde=True, color='magenta')
plt.title('Figure 4.2.7: Residual Distribution (Random Forest)')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/content/Figure_4_2_7_Residuals_RF.png') # Save the figure
plt.show()
Figure 4.2.8: Gradient Boosting - Residual Analysis
# Calculating residuals for the Gradient Boosting predictions
residuals_gb = y_test - y_pred_gb
# Plotting residuals distribution
plt.figure(figsize=(10, 6))
sns.histplot(residuals_gb, bins=20, kde=True, color='brown')
plt.title('Figure 4.2.8: Residual Distribution (Gradient Boosting)')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/content/Figure_4_2_8_Residuals_GB.png') # Save the figure
plt.show()
Figure 4.2.9: Random Forest - Learning Curve
from sklearn.model_selection import learning_curve
# Plotting learning curve: 5-fold CV at the default 5 training sizes; scores
# come back as negative MSE, hence the sign flip in the plot calls below.
plt.figure(figsize=(10, 6))
train_sizes, train_scores, test_scores = learning_curve(model_rf, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
plt.plot(train_sizes, -train_scores.mean(axis=1), label='Train Error', color='blue')
plt.plot(train_sizes, -test_scores.mean(axis=1), label='Test Error', color='green')
plt.title('Figure 4.2.9: Learning Curve (Random Forest)')
plt.xlabel('Training Size')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.grid(True)
plt.savefig('/content/Figure_4_2_9_Learning_Curve_RF.png') # Save the figure
plt.show()
Figure 4.2.10: Gradient Boosting - Learning Curve
# Plotting learning curve for Gradient Boosting (same setup as Figure 4.2.9)
plt.figure(figsize=(10, 6))
train_sizes, train_scores, test_scores = learning_curve(model_gb, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
plt.plot(train_sizes, -train_scores.mean(axis=1), label='Train Error', color='blue')
plt.plot(train_sizes, -test_scores.mean(axis=1), label='Test Error', color='green')
plt.title('Figure 4.2.10: Learning Curve (Gradient Boosting)')
plt.xlabel('Training Size')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.grid(True)
plt.savefig('/content/Figure_4_2_10_Learning_Curve_GB.png') # Save the figure
plt.show()
# Figure 4.2.11: Overall Distribution of Healthcare Costs
# FIX: these cells referenced `data`, which is undefined at this point in the
# notebook — only `df` was loaded in the 4.2.1 cell. Renamed to `df`.
plt.figure(figsize=(10, 6))
sns.histplot(df['charges'], bins=30, kde=True, color='teal')
plt.title('Figure 4.2.11: Overall Distribution of Healthcare Costs')
plt.xlabel('Charges')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/content/Figure_4_2_11_Overall_Distribution_Costs.png') # Save the figure
plt.show()

# Figure 4.2.12: Relationship Between Age and Healthcare Costs
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['age'], y=df['charges'], color='red')
plt.title('Figure 4.2.12: Relationship Between Age and Healthcare Costs')
plt.xlabel('Age')
plt.ylabel('Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_12_Age_vs_Costs.png') # Save the figure
plt.show()

# Figure 4.2.13: Relationship Between BMI and Healthcare Costs
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['bmi'], y=df['charges'], color='purple')
plt.title('Figure 4.2.13: Relationship Between BMI and Healthcare Costs')
plt.xlabel('BMI')
plt.ylabel('Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_13_BMI_vs_Costs.png') # Save the figure
plt.show()

# Figure 4.2.14: Impact of Smoking on Healthcare Costs
# FIX: pass hue= with legend=False to silence the seaborn FutureWarning
# ("Passing `palette` without assigning `hue` is deprecated") seen in the output.
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['smoker'], y=df['charges'], hue=df['smoker'], palette='Set2', legend=False)
plt.title('Figure 4.2.14: Impact of Smoking Status on Healthcare Costs')
plt.xlabel('Smoker')
plt.ylabel('Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_14_Smoking_Status_vs_Costs.png') # Save the figure
plt.show()
<ipython-input-33-a14e15d04678>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x=data['smoker'], y=data['charges'], palette='Set2')
# Figure 4.2.15: Regional Differences in Healthcare Costs
# FIX: replaced the undefined `data` with `df`, and pass hue= with legend=False
# to silence the seaborn palette-without-hue FutureWarning seen in the output.
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['region'], y=df['charges'], hue=df['region'], palette='Set1', legend=False)
plt.title('Figure 4.2.15: Regional Differences in Healthcare Costs')
plt.xlabel('Region')
plt.ylabel('Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_15_Regional_Differences.png') # Save the figure
plt.show()
<ipython-input-34-ee4208271dbd>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x=data['region'], y=data['charges'], palette='Set1')
# Figure 4.2.16: Number of Children vs. Healthcare Costs
# FIX: replaced the undefined `data` with `df`, and pass hue= with legend=False
# to silence the seaborn palette-without-hue FutureWarning seen in the output.
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['children'], y=df['charges'], hue=df['children'], palette='Pastel1', legend=False)
plt.title('Figure 4.2.16: Number of Children and Healthcare Costs')
plt.xlabel('Number of Children')
plt.ylabel('Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_16_Children_vs_Costs.png') # Save the figure
plt.show()
<ipython-input-35-882bfd53e39f>:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x=data['children'], y=data['charges'], palette='Pastel1')
# Figure 4.2.17: Interaction Between Age and Smoking on Healthcare Costs
# FIX: replaced the undefined `data` with `df`, and pass hue= with legend=False
# to silence the seaborn palette-without-hue FutureWarning seen in the output.
# Creating a combined column for age and smoking status interaction,
# e.g. "19_yes" — one box per (age, smoker) pair.
df['age_smoker'] = df['age'].astype(str) + '_' + df['smoker']
plt.figure(figsize=(12, 8))
sns.boxplot(x='age_smoker', y='charges', data=df, hue='age_smoker', palette='viridis', legend=False)
plt.title('Figure 4.2.17: Interaction Between Age and Smoking Status')
plt.xlabel('Age and Smoking Status')
plt.ylabel('Charges')
plt.xticks(rotation=90) # Rotate x-axis labels for readability
plt.grid(True)
plt.savefig('/content/Figure_4_2_17_Age_Smoking_Interaction.png') # Save the figure
plt.show()
<ipython-input-36-c6b1813f4d4f>:6: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='age_smoker', y='charges', data=data, palette='viridis')
all graphs in single file
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the dataset (this cell DOES define `data`, unlike the 4.2.x cells above)
data = pd.read_csv('/content/insurance.csv')
# Create a figure with a grid of subplots to hold all 4.2.x figures at once
fig, axs = plt.subplots(4, 5, figsize=(20, 16)) # 4 rows, 5 columns
# Flatten the axes array for easy iteration
axs = axs.flatten()
# Plotting the distribution of age (Figure 4.2.1)
sns.histplot(data['age'], bins=20, kde=True, color='blue', ax=axs[0])
axs[0].set_title('Figure 4.2.1: Distribution of Age')
axs[0].set_xlabel('Age')
axs[0].set_ylabel('Frequency')
# Plotting the distribution of BMI (Figure 4.2.2)
sns.histplot(data['bmi'], bins=20, kde=True, color='orange', ax=axs[1])
axs[1].set_title('Figure 4.2.2: Distribution of BMI')
axs[1].set_xlabel('BMI')
axs[1].set_ylabel('Frequency')
# Plotting the distribution of healthcare costs (Figure 4.2.3)
sns.histplot(data['charges'], bins=20, kde=True, color='green', ax=axs[2])
axs[2].set_title('Figure 4.2.3: Distribution of Healthcare Costs')
axs[2].set_xlabel('Charges')
axs[2].set_ylabel('Frequency')
# Plotting the correlation matrix (Figure 4.2.4)
# Numeric columns only, so .corr() skips the object-dtype categoricals
numeric_data = data.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=axs[3])
axs[3].set_title('Figure 4.2.4: Correlation Matrix')
# Plotting actual vs predicted charges (Random Forest) (Figure 4.2.5)
# NOTE(review): mid-cell imports and model training inside a plotting cell —
# kept as-is since this mirrors the standalone 4.2.5 cell above.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
X = data.drop('charges', axis=1)
y = data['charges']
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
sns.scatterplot(x=y_test, y=y_pred_rf, color='purple', ax=axs[4])
axs[4].plot([0, max(y_test)], [0, max(y_test)], color='red', linestyle='--')
axs[4].set_title('Figure 4.2.5: Actual vs Predicted Charges (Random Forest)')
axs[4].set_xlabel('Actual Charges')
axs[4].set_ylabel('Predicted Charges')
# Plotting actual vs predicted charges (Gradient Boosting) (Figure 4.2.6)
from sklearn.ensemble import GradientBoostingRegressor
model_gb = GradientBoostingRegressor()
model_gb.fit(X_train, y_train)
y_pred_gb = model_gb.predict(X_test)
sns.scatterplot(x=y_test, y=y_pred_gb, color='cyan', ax=axs[5])
axs[5].plot([0, max(y_test)], [0, max(y_test)], color='red', linestyle='--')
axs[5].set_title('Figure 4.2.6: Actual vs Predicted Charges (Gradient Boosting)')
axs[5].set_xlabel('Actual Charges')
axs[5].set_ylabel('Predicted Charges')
# Plotting residuals distribution (Random Forest) (Figure 4.2.7)
residuals_rf = y_test - y_pred_rf
sns.histplot(residuals_rf, bins=20, kde=True, color='magenta', ax=axs[6])
axs[6].set_title('Figure 4.2.7: Residual Distribution (Random Forest)')
axs[6].set_xlabel('Residuals')
axs[6].set_ylabel('Frequency')
# Plotting residuals distribution (Gradient Boosting) (Figure 4.2.8)
residuals_gb = y_test - y_pred_gb
sns.histplot(residuals_gb, bins=20, kde=True, color='brown', ax=axs[7])
axs[7].set_title('Figure 4.2.8: Residual Distribution (Gradient Boosting)')
axs[7].set_xlabel('Residuals')
axs[7].set_ylabel('Frequency')
# Plotting learning curve (Random Forest) (Figure 4.2.9)
from sklearn.model_selection import learning_curve
train_sizes, train_scores, test_scores = learning_curve(model_rf, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
axs[8].plot(train_sizes, -train_scores.mean(axis=1), label='Train Error', color='blue')
axs[8].plot(train_sizes, -test_scores.mean(axis=1), label='Test Error', color='green')
axs[8].set_title('Figure 4.2.9: Learning Curve (Random Forest)')
axs[8].set_xlabel('Training Size')
axs[8].set_ylabel('Mean Squared Error')
axs[8].legend()
axs[8].grid(True)
# Plotting learning curve (Gradient Boosting) (Figure 4.2.10)
train_sizes, train_scores, test_scores = learning_curve(model_gb, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
axs[9].plot(train_sizes, -train_scores.mean(axis=1), label='Train Error', color='blue')
axs[9].plot(train_sizes, -test_scores.mean(axis=1), label='Test Error', color='green')
axs[9].set_title('Figure 4.2.10: Learning Curve (Gradient Boosting)')
axs[9].set_xlabel('Training Size')
axs[9].set_ylabel('Mean Squared Error')
axs[9].legend()
axs[9].grid(True)
# Plotting overall distribution of healthcare costs (Figure 4.2.11)
sns.histplot(data['charges'], bins=30, kde=True, color='teal', ax=axs[10])
axs[10].set_title('Figure 4.2.11: Overall Distribution of Healthcare Costs')
axs[10].set_xlabel('Charges')
axs[10].set_ylabel('Frequency')
# Plotting relationship between age and healthcare costs (Figure 4.2.12)
sns.scatterplot(x=data['age'], y=data['charges'], color='red', ax=axs[11])
axs[11].set_title('Figure 4.2.12: Relationship Between Age and Healthcare Costs')
axs[11].set_xlabel('Age')
axs[11].set_ylabel('Charges')
# Plotting relationship between BMI and healthcare costs (Figure 4.2.13)
sns.scatterplot(x=data['bmi'], y=data['charges'], color='purple', ax=axs[12])
axs[12].set_title('Figure 4.2.13: Relationship Between BMI and Healthcare Costs')
axs[12].set_xlabel('BMI')
axs[12].set_ylabel('Charges')
# Plotting impact of smoking status on healthcare costs (Figure 4.2.14)
sns.boxplot(x=data['smoker'], y=data['charges'], palette='Set2', ax=axs[13])
axs[13].set_title('Figure 4.2.14: Impact of Smoking Status on Healthcare Costs')
axs[13].set_xlabel('Smoker')
axs[13].set_ylabel('Charges')
# Plotting regional differences in healthcare costs (Figure 4.2.15)
sns.boxplot(x=data['region'], y=data['charges'], palette='Set1', ax=axs[14])
axs[14].set_title('Figure 4.2.15: Regional Differences in Healthcare Costs')
axs[14].set_xlabel('Region')
axs[14].set_ylabel('Charges')
# Plotting number of children and healthcare costs (Figure 4.2.16)
sns.boxplot(x=data['children'], y=data['charges'], palette='Pastel1', ax=axs[15])
axs[15].set_title('Figure 4.2.16: Number of Children and Healthcare Costs')
axs[15].set_xlabel('Number of Children')
axs[15].set_ylabel('Charges')
# Plotting interaction between age and smoking status (Figure 4.2.17)
data['age_smoker'] = data['age'].astype(str) + '_' + data['smoker']
sns.boxplot(x='age_smoker', y='charges', data=data, palette='viridis', ax=axs[16])
axs[16].set_title('Figure 4.2.17: Interaction Between Age and Smoking Status')
axs[16].set_xlabel('Age and Smoking Status')
axs[16].set_ylabel('Charges')
axs[16].tick_params(axis='x', rotation=90) # Rotate x-axis labels for readability
# Adjust layout to avoid overlap
plt.tight_layout()
# Save the entire figure
plt.savefig('/content/All_Figures_4_2.png') # Save the figure
plt.show()
<ipython-input-37-df516952e2aa>:124: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x=data['smoker'], y=data['charges'], palette='Set2', ax=axs[13]) <ipython-input-37-df516952e2aa>:130: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x=data['region'], y=data['charges'], palette='Set1', ax=axs[14]) <ipython-input-37-df516952e2aa>:136: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x=data['children'], y=data['charges'], palette='Pastel1', ax=axs[15]) <ipython-input-37-df516952e2aa>:143: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='age_smoker', y='charges', data=data, palette='viridis', ax=axs[16])
# Step 1: Upload the notebook file
from google.colab import files
# Upload notebook file
uploaded = files.upload()
# The uploaded file name
notebook_file = list(uploaded.keys())[0]
print(f"Uploaded file: {notebook_file}")
# Step 2: Install nbconvert if not already installed
!pip install nbconvert
# Step 3: Convert the notebook to HTML
import subprocess
# Define the output HTML file name
output_html_file = notebook_file.replace('.ipynb', '.html')
# Convert the notebook to HTML
try:
subprocess.run(['jupyter', 'nbconvert', '--to', 'html', notebook_file, '--output', output_html_file], check=True)
print(f"Conversion successful: {output_html_file} created.")
except subprocess.CalledProcessError as e:
print(f"Error during conversion: {e}")
# Step 4: Provide download link for the HTML file
from google.colab import files
# Download the HTML file
try:
files.download(output_html_file)
print(f"Download initiated for: {output_html_file}")
except Exception as e:
print(f"Error during download: {e}")
Saving Maheshinsurance (1).ipynb to Maheshinsurance (1).ipynb Uploaded file: Maheshinsurance (1).ipynb Requirement already satisfied: nbconvert in /usr/local/lib/python3.10/dist-packages (6.5.4) Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from nbconvert) (4.9.4) Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (4.12.3) Requirement already satisfied: bleach in /usr/local/lib/python3.10/dist-packages (from nbconvert) (6.1.0) Requirement already satisfied: defusedxml in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.7.1) Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.4) Requirement already satisfied: jinja2>=3.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (3.1.4) Requirement already satisfied: jupyter-core>=4.7 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.7.2) Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.3.0) Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (2.1.5) Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.8.4) Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.10.0) Requirement already satisfied: nbformat>=5.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.10.4) Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from nbconvert) (24.1) Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (1.5.1) Requirement already satisfied: pygments>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (2.16.1) Requirement already satisfied: tinycss2 in /usr/local/lib/python3.10/dist-packages (from nbconvert) 
(1.3.0) Requirement already satisfied: traitlets>=5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.7.1) Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.10/dist-packages (from jupyter-core>=4.7->nbconvert) (4.2.2) Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.10/dist-packages (from nbclient>=0.5.0->nbconvert) (6.1.12) Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.10/dist-packages (from nbformat>=5.1->nbconvert) (2.20.0) Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.10/dist-packages (from nbformat>=5.1->nbconvert) (4.23.0) Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->nbconvert) (2.5) Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from bleach->nbconvert) (1.16.0) Requirement already satisfied: webencodings in /usr/local/lib/python3.10/dist-packages (from bleach->nbconvert) (0.5.1) Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (24.2.0) Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (2023.12.1) Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.35.1) Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.20.0) Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (24.0.1) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (2.8.2) Requirement already satisfied: 
tornado>=4.1 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (6.3.3) Conversion successful: Maheshinsurance (1).html created.
Download initiated for: Maheshinsurance (1).html
Diagrams for the project
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta

# Gantt chart of the 8-week project plan.
# Fix vs. the original cell: the rendered task label "FInding & Analysis"
# had a capitalisation typo; corrected to "Finding & Analysis".

# Define the start date of the project
start_date = datetime(2024, 6, 30)

# Define the tasks and their durations (in days)
tasks = [
    ("Introduction", 7),        # 1 week
    ("Literature Review", 14),  # 2 weeks
    ("Methodology", 7),         # 1 week
    ("Implementation", 14),     # 2 weeks
    ("Finding & Analysis", 7),  # 1 week
    ("Conclusion", 7)           # 1 week
]

# Roll the start date forward through the task list; each task starts
# one day after its predecessor ends.
task_dates = []
current_start = start_date
for task, duration in tasks:
    end_date = current_start + timedelta(days=duration)
    task_dates.append((task, current_start, end_date))
    current_start = end_date + timedelta(days=1)

# Create the Gantt chart: one horizontal bar per task, labelled at its centre
fig, ax = plt.subplots(figsize=(10, 6))
for i, (task, start, end) in enumerate(task_dates):
    ax.barh(i, (end - start).days, left=start, color='skyblue', edgecolor='black')
    ax.text(start + (end - start) / 2, i, task, ha='center', va='center', color='black', fontsize=10)

# Weekly major ticks on the x-axis, formatted like "Jun 30"
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=7))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %d"))

# Labels and title
ax.set_xlabel('Date')
ax.set_ylabel('Tasks')
ax.set_title('8-Week Project Plan (June 30th - September 5th, 2024)')
ax.set_yticks(range(len(tasks)))
ax.set_yticklabels([task for task, start, end in task_dates])

# First task at the top
ax.invert_yaxis()

# Rotate the x-axis labels for better readability
plt.xticks(rotation=45)

# Light vertical grid for readability
plt.grid(True, axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
2.4
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Render a three-row comparison of the candidate ML algorithms as a
# matplotlib table and save it to ML_Algorithm_Comparison_Table.png.
comparison_rows = {
    "Algorithm": ["Random Forest", "Gradient Boosting Machines (GBM)", "XGBoost"],
    "Strengths": [
        "Handles high-dimensional data effectively; reduces overfitting through ensemble learning; robust to noisy data.",
        "Provides high accuracy by correcting errors of previous models; captures complex patterns and interactions.",
        "Optimized for speed and performance; includes regularization to prevent overfitting; handles missing values effectively."
    ],
    "Limitations": [
        "Computationally intensive for large datasets; less transparent.",
        "High risk of overfitting if not tuned correctly; requires significant computation.",
        "Requires careful hyperparameter tuning; implementation complexity."
    ]
}
comparison_df = pd.DataFrame(comparison_rows)

# Figure sized to the table; axes are hidden since only the table is drawn
fig, ax = plt.subplots(figsize=(10, 3))
ax.axis('tight')
ax.axis('off')

table_artist = ax.table(cellText=comparison_df.values,
                        colLabels=comparison_df.columns,
                        cellLoc='center', loc='center')
# Fixed 10pt font and auto-sized columns keep long cells readable
table_artist.auto_set_font_size(False)
table_artist.set_fontsize(10)
table_artist.auto_set_column_width(col=list(range(len(comparison_df.columns))))

# Persist at print quality, then display
plt.savefig('ML_Algorithm_Comparison_Table.png', bbox_inches='tight', dpi=300)
plt.show()
2.3
import matplotlib.pyplot as plt
from graphviz import Digraph

# Directed graph of research gaps, chained in sequence A -> B -> C -> D.
gap_nodes = [
    ('A', 'Lack of Comprehensive Data Integration'),
    ('B', 'Model Interpretability Issues'),
    ('C', 'Ethical Concerns\n(Data Privacy and Security)'),
    ('D', 'Need for High-Quality Datasets'),
]

dot = Digraph()
for node_id, label in gap_nodes:
    dot.node(node_id, label)

# Link each gap to the next one in the chain
for src, dst in zip('ABC', 'BCD'):
    dot.edge(src, dst)

# Write ML_Healthcare_Gaps.png next to the DOT source, then display inline
dot.render('ML_Healthcare_Gaps', format='png', cleanup=False)
dot
2.2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from graphviz import Digraph
from PIL import Image
import matplotlib.image as mpimg

# Produce three artifacts (comparison table, score bar chart, technique
# flowchart) as PNGs, then stack them vertically into combined_image.png.

def _save_table_image():
    """Step 1: render the technique comparison table to table_image.png; return the DataFrame."""
    rows = {
        "Technique": ["Linear Regression", "Time Series Analysis", "Random Forest", "Gradient Boosting Machines (GBM)", "XGBoost"],
        "Strengths": [
            "Simple, well-defined relationships",
            "Identifies patterns over time",
            "Handles high-dimensional data; reduces overfitting",
            "Captures complex patterns and interactions",
            "Optimized for speed; handles large datasets"
        ],
        "Limitations": [
            "Oversimplifies non-linear relationships",
            "Struggles with unexpected shifts and variability",
            "Computationally intensive; less transparent",
            "High risk of overfitting if not tuned correctly",
            "Requires careful tuning; implementation complexity"
        ]
    }
    frame = pd.DataFrame(rows)
    fig, ax = plt.subplots(figsize=(12, 3))
    ax.axis('tight')
    ax.axis('off')
    tbl = ax.table(cellText=frame.values, colLabels=frame.columns, cellLoc='center', loc='center')
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.auto_set_column_width(col=list(range(len(frame.columns))))
    plt.savefig('table_image.png', bbox_inches='tight', dpi=300)
    plt.close()
    return frame

def _save_bar_chart(frame):
    """Step 2: grouped bar chart of subjective 1-5 scores -> bar_chart_image.png."""
    fig, ax = plt.subplots(figsize=(12, 6))
    strengths = np.array([2, 3, 4, 4, 4])
    limitations = np.array([3, 3, 3, 4, 4])
    index = np.arange(len(frame["Technique"]))
    bar_width = 0.35
    groups = [
        plt.bar(index, strengths, bar_width, label="Strengths", color='skyblue'),
        plt.bar(index + bar_width, limitations, bar_width, label="Limitations", color='salmon'),
    ]
    plt.xlabel('Techniques')
    plt.ylabel('Score (1-5)')
    plt.title('Comparison of Strengths and Limitations')
    plt.xticks(index + bar_width / 2, frame["Technique"], rotation=30, ha="right")
    plt.legend()
    # Annotate every bar with its numeric value
    for group in groups:
        for bar in group:
            yval = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2, yval + 0.1, yval, ha='center', va='bottom')
    plt.tight_layout()
    plt.savefig('bar_chart_image.png', dpi=300)
    plt.close()

def _save_flowchart():
    """Step 3: taxonomy flowchart of the techniques -> flowchart_image.png."""
    dot = Digraph()
    labelled_nodes = [
        ('A', 'Traditional Techniques'),
        ('B', 'Linear Regression'),
        ('C', 'Time Series Analysis'),
        ('D', 'Machine Learning Techniques'),
        ('E', 'Random Forest'),
        ('F', 'Gradient Boosting Machines (GBM)'),
        ('G', 'XGBoost'),
    ]
    for node_id, label in labelled_nodes:
        dot.node(node_id, label)
    for src, dst in [('A', 'B'), ('A', 'C'), ('A', 'D'), ('D', 'E'), ('D', 'F'), ('D', 'G')]:
        dot.edge(src, dst)
    dot.render('flowchart_image', format='png', cleanup=False)

def _combine_images():
    """Step 4: stack the three PNGs top-to-bottom on a white canvas."""
    parts = [Image.open(name) for name in
             ('table_image.png', 'bar_chart_image.png', 'flowchart_image.png')]
    canvas_width = max(part.width for part in parts)
    canvas_height = sum(part.height for part in parts)
    canvas = Image.new('RGB', (canvas_width, canvas_height), (255, 255, 255))
    y_offset = 0
    for part in parts:
        canvas.paste(part, (0, y_offset))
        y_offset += part.height
    canvas.save('combined_image.png')
    return canvas

df = _save_table_image()
_save_bar_chart(df)
_save_flowchart()
combined_img = _combine_images()
# Display the combined image to check the output
combined_img.show()
3 Methodology
from graphviz import Digraph
import os

# Five-step methodology flowchart saved to Google Drive.
save_path = '/content/drive/MyDrive/Projects'
os.makedirs(save_path, exist_ok=True)  # create the folder on first run

filename = 'methodology_flowchart.png'
full_path = os.path.join(save_path, filename)

dot = Digraph()

# Pipeline stages in order: (node id, label)
stages = [
    ('A', 'Data Collection'),
    ('B', 'Data Preprocessing'),
    ('C', 'Algorithm Selection'),
    ('D', 'Evaluation Metrics'),
    ('E', 'Model Performance Assessment'),
]
# One label per consecutive stage-to-stage transition
transition_labels = [
    'Source Datasets from Kaggle & Academic Databases',
    'Ensure Data Quality and Relevance',
    'Use Advanced ML Algorithms',
    'Assess Model Performance',
]

for node_id, text in stages:
    dot.node(node_id, text)
for (src, _), (dst, _), label in zip(stages, stages[1:], transition_labels):
    dot.edge(src, dst, label=label)

# graphviz appends the format suffix itself, so strip '.png' from the base name
dot.render(filename=full_path.replace('.png', ''), format='png', cleanup=False)
print(f"Flowchart saved as '{full_path}'")
Flowchart saved as '/content/drive/MyDrive/dissertations/Mahesh/methodology_flowchart.png'
3.1
from graphviz import Digraph
import os

# Detailed methodology flowchart (traditional methods -> advanced approach
# -> implementation stack), saved to Google Drive.
save_path = '/content/drive/MyDrive/Projects'
os.makedirs(save_path, exist_ok=True)  # create the folder on first run

filename = 'methodology_detailed_flowchart.png'
full_path = os.path.join(save_path, filename)

dot = Digraph()

# (node id, label) — declared in the same order as drawn
node_specs = [
    ('A', 'Traditional Methods'),
    ('A1', 'Linear Regression'),
    ('A2', 'Time-Series Analysis'),
    ('B', 'Advanced Approach'),
    ('B1', 'Medical Cost Personal Dataset'),
    ('B2', 'Insights from IEEE Xplore & Google Scholar'),
    ('C', 'Implementation'),
    ('C1', 'Google Colab Environment'),
    ('C2', 'Python Libraries\n(Pandas, NumPy, Scikit-learn)'),
    ('C3', 'Data Visualization\n(Matplotlib, Seaborn)'),
    ('C4', 'Google Drive Integration'),
]
# (source, target, edge label)
edge_specs = [
    ('A', 'B', 'Overcoming Limitations'),
    ('A1', 'A', 'Used for Basic Analysis'),
    ('A2', 'A', 'Used for Trend Identification'),
    ('B', 'C', 'Advanced Methodology'),
    ('B1', 'B', 'Comprehensive Dataset'),
    ('B2', 'B', 'Scholarly Insights'),
    ('C', 'C1', 'Computational Resources'),
    ('C', 'C2', 'Data Manipulation and ML'),
    ('C', 'C3', 'Model Performance Visualization'),
    ('C', 'C4', 'Data Storage and Access'),
]

for node_id, label in node_specs:
    dot.node(node_id, label)
for src, dst, label in edge_specs:
    dot.edge(src, dst, label=label)

# graphviz appends the format suffix itself, so strip '.png' from the base name
dot.render(filename=full_path.replace('.png', ''), format='png', cleanup=False)
print(f"Flowchart saved as '{full_path}'")
Flowchart saved as '/content/drive/MyDrive/dissertations/Mahesh/methodology_detailed_flowchart.png'
3.2
from graphviz import Digraph
import os

# Flowchart of the four data-preprocessing steps with their sub-steps,
# saved to Google Drive.
save_path = '/content/drive/MyDrive/Projects'
os.makedirs(save_path, exist_ok=True)  # create the folder on first run

filename = 'data_preprocessing_flowchart.png'
full_path = os.path.join(save_path, filename)

dot = Digraph()

# (node id, label) — top-level steps first, then their detail nodes,
# in the same declaration order as the original chart
node_specs = [
    ('A', 'Data Preprocessing'),
    ('A1', 'Data Cleaning'),
    ('A2', 'Feature Engineering'),
    ('A3', 'Normalization and Scaling'),
    ('A4', 'Outlier Detection'),
    ('B', 'Implementation in Google Colab'),
    ('A1.1', 'Remove Duplicate Entries'),
    ('A1.2', 'Handle Missing Values:\n - Mean Imputation\n - Mode Imputation'),
    ('A2.1', 'Generate Interaction Terms'),
    ('A2.2', 'One-Hot Encoding'),
    ('A3.1', 'Min-Max Scaling'),
    ('A4.1', 'Detect Outliers using Z-Scores'),
    ('A4.2', 'Transform or Remove Outliers'),
]
# (source, target, edge label); note the intentional B -> B self-loop
edge_specs = [
    ('A', 'A1', 'Step 1: Data Cleaning'),
    ('A1', 'A1.1', 'Step 1.1'),
    ('A1', 'A1.2', 'Step 1.2'),
    ('A', 'A2', 'Step 2: Feature Engineering'),
    ('A2', 'A2.1', 'Step 2.1'),
    ('A2', 'A2.2', 'Step 2.2'),
    ('A', 'A3', 'Step 3: Normalization and Scaling'),
    ('A3', 'A3.1', 'Step 3.1'),
    ('A', 'A4', 'Step 4: Outlier Detection'),
    ('A4', 'A4.1', 'Step 4.1'),
    ('A4', 'A4.2', 'Step 4.2'),
    ('A', 'B', 'Implementation in Google Colab'),
    ('B', 'B', 'Facilitates All Steps'),
]

for node_id, label in node_specs:
    dot.node(node_id, label)
for src, dst, label in edge_specs:
    dot.edge(src, dst, label=label)

# graphviz appends the format suffix itself, so strip '.png' from the base name
dot.render(filename=full_path.replace('.png', ''), format='png', cleanup=False)
print(f"Flowchart saved as '{full_path}'")
Flowchart saved as '/content/drive/MyDrive/dissertations/Mahesh/data_preprocessing_flowchart.png'
3.3
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def draw_random_forest_diagram():
    """Draw a box-and-arrow schematic of the Random Forest method and save it to Drive."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title('Figure 3.3.1: Random Forest Method', fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')

    # Box label -> lower-left corner; the last entry aggregates the four trees
    layout = {
        'Decision Tree 1': (0.1, 0.6),
        'Decision Tree 2': (0.3, 0.6),
        'Decision Tree 3': (0.5, 0.6),
        'Decision Tree 4': (0.7, 0.6),
        'Aggregated Prediction': (0.4, 0.3),
    }
    for label, (x, y) in layout.items():
        box = patches.FancyBboxPatch((x, y), 0.2, 0.1, boxstyle="round,pad=0.05",
                                     edgecolor='black', facecolor='lightblue', lw=1.5)
        ax.add_patch(box)
        ax.text(x + 0.1, y + 0.05, label, ha='center', va='center',
                fontsize=10, fontweight='bold')

    # One arrow from each tree box down to the aggregation box
    sink = layout['Aggregated Prediction']
    for tree in ('Decision Tree 1', 'Decision Tree 2', 'Decision Tree 3', 'Decision Tree 4'):
        ax.annotate('', xy=sink, xytext=layout[tree],
                    arrowprops=dict(arrowstyle="->", lw=1.5))

    plt.savefig('/content/drive/MyDrive/Projects/random_forest_diagram.png', bbox_inches='tight')
    plt.show()

draw_random_forest_diagram()
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def draw_gbm_diagram():
    """Draw the sequential-model schematic for Gradient Boosting Machines and save it to Drive."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title('Figure 3.3.2: Gradient Boosting Machines Method', fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')

    # Four model boxes stacked top-to-bottom
    positions = [(0.2, 0.6), (0.2, 0.4), (0.2, 0.2), (0.2, 0.0)]
    for idx, (x, y) in enumerate(positions, start=1):
        box = patches.FancyBboxPatch((x, y), 0.2, 0.1, boxstyle="round,pad=0.05",
                                     edgecolor='black', facecolor='lightgreen', lw=1.5)
        ax.add_patch(box)
        ax.text(x + 0.1, y + 0.05, f'Model {idx}', ha='center', va='center',
                fontsize=10, fontweight='bold')

    # Arrows chain each model to the next one below it
    for start, end in zip(positions, positions[1:]):
        ax.annotate('', xy=end, xytext=start, arrowprops=dict(arrowstyle="->", lw=1.5))

    plt.savefig('/content/drive/MyDrive/Projects/gbm_diagram.png', bbox_inches='tight')
    plt.show()

draw_gbm_diagram()
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def draw_xgboost_diagram():
    """Draw the XGBoost feature schematic (core box with two feature boxes) and save it to Drive."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title('Figure 3.3.3: XGBoost Method', fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')

    # (label, lower-left corner); the first entry is the XGBoost core box
    layout = [
        ('XGBoost', (0.3, 0.4)),
        ('Regularization', (0.1, 0.6)),
        ('Parallel Processing', (0.5, 0.6)),
    ]
    for label, (x, y) in layout:
        box = patches.FancyBboxPatch((x, y), 0.2, 0.1, boxstyle="round,pad=0.05",
                                     edgecolor='black', facecolor='lightcoral', lw=1.5)
        ax.add_patch(box)
        ax.text(x + 0.1, y + 0.05, label, ha='center', va='center',
                fontsize=10, fontweight='bold')

    # Arrows from the core box to each feature box
    core = layout[0][1]
    for _, target in layout[1:]:
        ax.annotate('', xy=target, xytext=core, arrowprops=dict(arrowstyle="->", lw=1.5))

    plt.savefig('/content/drive/MyDrive/Projects/xgboost_diagram.png', bbox_inches='tight')
    plt.show()

draw_xgboost_diagram()
3.3 tables
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import table

def plot_random_forest_table():
    """Render the Random Forest summary table as an image and save it to Drive."""
    aspects = ['Technique', 'Core Concept', 'Training Process', 'Prediction Aggregation', 'Overfitting Mitigation', 'Feature Handling', 'Strengths', 'Application', 'Reference']
    descriptions = [
        'Ensemble Learning Method',
        'Aggregates the predictions of multiple decision trees to enhance accuracy and robustness',
        'Creates numerous decision trees, each trained on a subset of the data using a bootstrap sample',
        'Combines tree predictions through majority voting (classification) or averaging (regression)',
        'Reduces variance and improves generalization by averaging results from multiple trees',
        'Manages high-dimensional data effectively and captures complex patterns by considering various features and interactions simultaneously',
        '• Handles high-dimensional data well\n• Captures complex interactions\n• Robust to overfitting',
        'Useful for predicting healthcare costs by analyzing interactions between variables such as age, BMI, and smoking status',
        'Aldahiri, A., Alrashed, B., & Hussain, W. (2021). Trends in using IoT with machine learning in health prediction system. Forecasting.'
    ]
    df_rf = pd.DataFrame({'Aspect': aspects, 'Description': descriptions})

    # Hidden axes; only the table artist is rendered
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.axis('off')
    tbl = table(ax, df_rf, loc='center', cellLoc='left', colWidths=[0.4] * len(df_rf.columns))
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.scale(1.2, 1.2)
    ax.set_title('Table: Summary of Random Forest Method', fontsize=14, fontweight='bold')

    plt.savefig('/content/drive/MyDrive/Projects/random_forest_table.png', bbox_inches='tight')
    plt.show()

plot_random_forest_table()
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import table

def plot_gbm_table():
    """Render the GBM overview table as an image and save it to Drive."""
    aspects = ['Method', 'Key Characteristics', 'Process', 'Strengths', 'Application']
    descriptions = [
        'Gradient Boosting Machines (GBM)',
        'Builds models sequentially, each correcting errors of its predecessor',
        'Iterative training to refine predictions and capture complex relationships',
        'Excels in modelling complex interactions and non-linear patterns, improving prediction accuracy incrementally',
        'Effective for predicting healthcare costs by addressing nuances and interactions in the data (Wang, 2021)'
    ]
    df_gbm = pd.DataFrame({'Aspect': aspects, 'Description': descriptions})

    # Hidden axes; only the table artist is rendered
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.axis('off')
    tbl = table(ax, df_gbm, loc='center', cellLoc='left', colWidths=[0.4] * len(df_gbm.columns))
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.scale(1.2, 1.2)
    ax.set_title('Table: Overview of Gradient Boosting Machines (GBM)', fontsize=14, fontweight='bold')

    plt.savefig('/content/drive/MyDrive/Projects/gbm_table.png', bbox_inches='tight')
    plt.show()

plot_gbm_table()
3.3.3 Table
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import table

def plot_xgboost_table():
    """Render the XGBoost overview table as an image and save it to Drive."""
    aspects = ['Method', 'Key Characteristics', 'Features', 'Strengths', 'Application']
    descriptions = [
        'XGBoost',
        'Optimized variant of GBM with advanced features',
        'Regularization techniques to prevent overfitting, parallel processing for faster computation',
        'Handles large datasets and complex feature interactions efficiently, improving prediction accuracy and reliability',
        'Particularly effective for healthcare cost prediction by addressing limitations of traditional methods (Johnson et al., 2023)'
    ]
    df_xgboost = pd.DataFrame({'Aspect': aspects, 'Description': descriptions})

    # Hidden axes; only the table artist is rendered
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.axis('off')
    tbl = table(ax, df_xgboost, loc='center', cellLoc='left', colWidths=[0.4] * len(df_xgboost.columns))
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.scale(1.2, 1.2)
    ax.set_title('Table: Overview of XGBoost Method', fontsize=14, fontweight='bold')

    plt.savefig('/content/drive/MyDrive/Projects/xgboost_table.png', bbox_inches='tight')
    plt.show()

plot_xgboost_table()
3.4.1
import matplotlib.pyplot as plt

def plot_mae_diagram():
    """Bar chart of MAE per model (placeholder values), saved to Drive."""
    model_scores = {'Random Forest': 0.23, 'GBM': 0.18, 'XGBoost': 0.15}  # Example MAE values; replace with actual values
    names = list(model_scores)
    values = list(model_scores.values())

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(names, values, color=['blue', 'orange', 'green'])
    ax.set_xlabel('Models', fontsize=12)
    ax.set_ylabel('Mean Absolute Error (MAE)', fontsize=12)
    ax.set_title('Figure 3.4.1: Mean Absolute Error (MAE) for Each Model', fontsize=14, fontweight='bold')
    ax.set_ylim(0, max(values) + 0.1)

    # Annotate each bar with its value
    for position, value in enumerate(values):
        ax.text(position, value + 0.01, f'{value:.2f}', ha='center', va='bottom')

    plt.savefig('/content/drive/MyDrive/Projects/mae_diagram.png', bbox_inches='tight')
    plt.show()

plot_mae_diagram()
import matplotlib.pyplot as plt
def plot_rmse_diagram(rmse_values=None, models=None,
                      save_path='/content/drive/MyDrive/Projects/rmse_diagram.png'):
    """Plot a labelled bar chart of Root Mean Squared Error per model.

    Args:
        rmse_values: RMSE score per model. Defaults to the illustrative
            placeholder values used in the report draft.
        models: Model names, parallel to ``rmse_values``. Defaults to the
            three models compared in the study.
        save_path: Destination image file for the chart.
    """
    if models is None:
        models = ['Random Forest', 'GBM', 'XGBoost']
    if rmse_values is None:
        rmse_values = [0.30, 0.25, 0.20]  # example RMSE values; replace with measured ones
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(models, rmse_values, color=['blue', 'orange', 'green'])
    ax.set_xlabel('Models', fontsize=12)
    ax.set_ylabel('Root Mean Squared Error (RMSE)', fontsize=12)
    ax.set_title('Figure 3.4.2: Root Mean Squared Error (RMSE) for Each Model', fontsize=14, fontweight='bold')
    ax.set_ylim(0, max(rmse_values) + 0.1)  # headroom so the value labels fit
    # Print each bar's value just above it.
    for i, value in enumerate(rmse_values):
        ax.text(i, value + 0.01, f'{value:.2f}', ha='center', va='bottom')
    plt.savefig(save_path, bbox_inches='tight')
    plt.show()
plot_rmse_diagram()
import matplotlib.pyplot as plt
def plot_r_squared_diagram(r_squared_values=None, models=None,
                           save_path='/content/drive/MyDrive/Projects/r_squared_diagram.png'):
    """Plot a labelled bar chart of the R-squared score per model.

    Args:
        r_squared_values: R² score per model. Defaults to the illustrative
            placeholder values used in the report draft.
        models: Model names, parallel to ``r_squared_values``. Defaults to
            the three models compared in the study.
        save_path: Destination image file for the chart.
    """
    if models is None:
        models = ['Random Forest', 'GBM', 'XGBoost']
    if r_squared_values is None:
        r_squared_values = [0.75, 0.80, 0.85]  # example R² values; replace with measured ones
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(models, r_squared_values, color=['blue', 'orange', 'green'])
    ax.set_xlabel('Models', fontsize=12)
    ax.set_ylabel('R-Squared (R²)', fontsize=12)
    ax.set_title('Figure 3.4.3: R-Squared (R²) for Each Model', fontsize=14, fontweight='bold')
    ax.set_ylim(0, 1.1)  # R² is bounded by 1; leave room for the labels
    # Print each bar's value just above it.
    for i, value in enumerate(r_squared_values):
        ax.text(i, value + 0.02, f'{value:.2f}', ha='center', va='bottom')
    plt.savefig(save_path, bbox_inches='tight')
    plt.show()
plot_r_squared_diagram()
Chapter 5
import matplotlib.pyplot as plt
# Three ethical aspects, given roughly equal weight in the discussion.
aspects = ['Data Privacy', 'Algorithmic Bias', 'Responsible Use']
percentages = [33, 33, 34]  # near-equal split for simplicity
fig, ax = plt.subplots(figsize=(8, 8))
wedge_colors = ['skyblue', 'lightgreen', 'lightcoral']
# Draw the pie with percentage labels inside each wedge.
ax.pie(percentages,
       labels=aspects,
       autopct='%1.1f%%',
       colors=wedge_colors,
       startangle=140)
ax.set_title('Ethical Aspects of ML in Healthcare', fontsize=16, fontweight='bold')
# Persist the figure to Drive before displaying it.
plt.savefig('/content/drive/MyDrive/Projects/ethical_aspects_diagram.png', bbox_inches='tight')
plt.show()
5.1
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Vertical flowchart: each data-privacy safeguard flows into the next.
fig, ax = plt.subplots(figsize=(12, 10))
box_style = dict(boxstyle='round,pad=0.3', edgecolor='black', facecolor='lightblue')
# (label, (x, y)) pairs, listed top to bottom.
steps = [
    ("Data Anonymization\n& De-Identification", (0.5, 0.9)),
    ("Access Controls\n(Role-Based)", (0.5, 0.75)),
    ("Encryption\n(At Rest & In Transit)", (0.5, 0.6)),
    ("Informed Consent\n(Transparency)", (0.5, 0.45)),
    ("Regular Audits\n& Assessments", (0.5, 0.3)),
    ("Data Stewardship\n(Compliance)", (0.5, 0.15)),
    ("Regulatory Compliance\n(GDPR, HIPAA)", (0.5, 0.0))
]
# Draw every step as a rounded text box.
for label, (x, y) in steps:
    ax.text(x, y, label, ha='center', va='center', fontsize=12, bbox=box_style)
# Connect consecutive boxes with downward arrows.
for (_, src), (_, dst) in zip(steps, steps[1:]):
    ax.annotate('', xy=dst, xytext=src,
                arrowprops=dict(facecolor='black', shrink=0.05))
ax.set_title('Data Privacy Considerations in ML Healthcare Applications', fontsize=16, fontweight='bold')
ax.set_xlim(0, 1)
ax.set_ylim(-0.1, 1)
ax.axis('off')  # the chart is pure annotation; hide the axes frame
plt.savefig('/content/drive/MyDrive/Projects/data_privacy_flowchart.png', bbox_inches='tight')
plt.show()
5.2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Vertical flowchart of bias-mitigation stages, drawn top to bottom.
fig, ax = plt.subplots(figsize=(12, 12))
box_style = dict(boxstyle='round,pad=0.3', edgecolor='black', facecolor='lightgreen')
# (label, (x, y)) pairs in display order.
steps = [
    ("Biased Training Data", (0.5, 0.9)),
    ("Bias Mitigation\n Techniques", (0.5, 0.75)),
    ("Fairness Metrics\n Evaluation", (0.5, 0.6)),
    ("Ongoing Monitoring\n& Updates", (0.5, 0.45)),
    ("Transparency\n (Model Explainability)", (0.5, 0.3)),
    ("Diverse Team\nComposition", (0.5, 0.15)),
    ("Collaboration\n (Ethicists & Experts)", (0.5, 0.0))
]
# One rounded box per stage.
for label, (x, y) in steps:
    ax.text(x, y, label, ha='center', va='center', fontsize=12, bbox=box_style)
# Arrows link each stage to its successor.
for (_, src), (_, dst) in zip(steps, steps[1:]):
    ax.annotate('', xy=dst, xytext=src,
                arrowprops=dict(facecolor='black', shrink=0.05))
ax.set_title('Addressing Algorithmic Bias in ML Healthcare Models', fontsize=16, fontweight='bold')
ax.set_xlim(0, 1)
ax.set_ylim(-0.1, 1)
ax.axis('off')  # annotation-only figure; the axes frame adds nothing
plt.savefig('/content/drive/MyDrive/Projects/algorithmic_bias_flowchart.png', bbox_inches='tight')
plt.show()
5.3
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Vertical chain of recommendations for responsible ML use.
fig, ax = plt.subplots(figsize=(12, 12))
box_style = dict(boxstyle='round,pad=0.3', edgecolor='black', facecolor='lightblue')
# (label, (x, y)) pairs, top to bottom.
recommendations = [
    ("Transparency\n& Explainability", (0.5, 0.9)),
    ("Ethical Governance\nFramework", (0.5, 0.75)),
    ("Proactive Bias\nMitigation", (0.5, 0.6)),
    ("Compliance with\nLegal Regulations", (0.5, 0.45)),
    ("Stakeholder Engagement", (0.5, 0.3)),
    ("Ongoing Education\n& Training", (0.5, 0.15))
]
# One rounded box per recommendation.
for label, (x, y) in recommendations:
    ax.text(x, y, label, ha='center', va='center', fontsize=12, bbox=box_style)
# Arrows chain each recommendation to the next one down.
for (_, src), (_, dst) in zip(recommendations, recommendations[1:]):
    ax.annotate('', xy=dst, xytext=src,
                arrowprops=dict(facecolor='black', shrink=0.05))
ax.set_title('Recommendations for Responsible Use of ML in Healthcare', fontsize=16, fontweight='bold')
ax.set_xlim(0, 1)
ax.set_ylim(-0.1, 1)
ax.axis('off')  # hide the frame; only the boxes and arrows matter
plt.savefig('/content/drive/MyDrive/Projects/responsible_ml_use_recommendations.png', bbox_inches='tight')
plt.show()
6.1
import matplotlib.pyplot as plt
import numpy as np
# Measured metrics for the two ensemble models under comparison.
models = ['Random Forest', 'Gradient Boosting']
mae_values = [2664.97, 2490.64]
rmse_values = [4634.45, 4438.10]
width = 0.35  # width of each bar
x = np.arange(len(models))  # one tick position per model
fig, ax = plt.subplots(figsize=(10, 6))
# Paired bars: MAE on the left of each group, RMSE on the right.
rects1 = ax.bar(x - width / 2, mae_values, width, label='MAE')
rects2 = ax.bar(x + width / 2, rmse_values, width, label='RMSE')
ax.set_xlabel('Models')
ax.set_ylabel('Values')
ax.set_title('Comparison of ML Models Performance')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for bar in rects:
        value = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(f'{value:.2f}',
                    xy=(bar_center, value),
                    xytext=(0, 3),  # nudge the label 3 points upward
                    textcoords="offset points",
                    ha='center', va='bottom')
autolabel(rects1)
autolabel(rects2)
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Projects/model_performance_comparison.png')
plt.show()
Table 6.1
import matplotlib.pyplot as plt
import pandas as pd
# Summary of evaluation metrics for the two best-performing models.
data = {
    'Metric': ['Mean Absolute Error (MAE)', 'Root Mean Squared Error (RMSE)', 'Average R² Score'],
    'Random Forest': [2664.97, 4634.45, 0.83],
    'Gradient Boosting': [2490.64, 4438.10, 0.86]
}
df = pd.DataFrame(data)
fig, ax = plt.subplots(figsize=(8, 4))  # set size frame
ax.axis('tight')
ax.axis('off')
# FIX: the metric names are already the first column of `df`, so passing them
# again as `rowLabels` rendered each metric name twice. Also bind the artist
# to `tbl`, not `table`, so we don't shadow `pandas.plotting.table` imported
# in an earlier cell.
tbl = ax.table(cellText=df.values, colLabels=df.columns, cellLoc='center', loc='center')
plt.savefig('/content/drive/MyDrive/Projects/model_performance_summary.png', bbox_inches='tight')
plt.show()
6.2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# One rounded box per stakeholder, stacked vertically and linked by arrows.
fig, ax = plt.subplots(figsize=(12, 8))
colors = ['lightblue', 'lightgreen', 'lightcoral']
stakeholders = ['Healthcare Providers', 'Insurance Companies', 'Policymakers']
implications = [
    'Optimize resource allocation and patient management\nTailor services to high-risk populations\nReduce overall healthcare expenditures',
    'Enhance risk assessment and pricing strategies\nSet premiums more accurately\nDevelop personalized insurance products',
    'Improve transparency and fairness\nInform policy decisions\nDevelop equitable strategies for resource distribution'
]
coordinates = [(0.2, 0.8), (0.2, 0.5), (0.2, 0.2)]
for (x, y), stakeholder, implication, color in zip(coordinates, stakeholders, implications, colors):
    # FIX: the box is 0.4 wide, so its origin must be x - 0.2 for it to be
    # centred on the text drawn at x (the earlier draft used x - 0.1, which
    # left the box offset to the right of its labels).
    rect = patches.FancyBboxPatch((x - 0.2, y - 0.1), 0.4, 0.2, boxstyle="round,pad=0.1",
                                  edgecolor='black', facecolor=color, linewidth=2)
    ax.add_patch(rect)
    # FIX: place the bold heading above the detail text so the two labels no
    # longer overlap (mirrors the layout used by the later concept maps).
    ax.text(x, y + 0.05, stakeholder, ha='center', va='center', fontsize=12, fontweight='bold')
    ax.text(x, y - 0.05, implication, ha='center', va='center', fontsize=10, wrap=True)
# Arrows from each box to the one below it.
for (x1, y1), (x2, y2) in zip(coordinates, coordinates[1:]):
    ax.annotate('', xy=(x2, y2 + 0.1), xytext=(x1, y1 - 0.1),
                arrowprops=dict(arrowstyle='->', lw=2))
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.axis('off')
plt.title('Implications of ML Models for Stakeholders in Healthcare', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Projects/ml_impact_stakeholders.png')
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
# Stakeholder-by-implication summary rendered as a figure table.
data = {
    'Stakeholder': ['Healthcare Providers', 'Insurance Companies', 'Policymakers'],
    'Implications': [
        'Optimize resource allocation and patient management; Tailor services to high-risk populations; Reduce overall healthcare expenditures',
        'Enhance risk assessment and pricing strategies; Set premiums more accurately; Develop personalized insurance products',
        'Improve transparency and fairness; Inform policy decisions; Develop equitable strategies for resource distribution'
    ]
}
df = pd.DataFrame(data)
fig, ax = plt.subplots(figsize=(10, 5))  # set size frame
ax.axis('tight')
ax.axis('off')
# FIX: bind the table artist to `tbl` instead of `table`, so this cell no
# longer shadows `pandas.plotting.table` imported in an earlier cell.
tbl = ax.table(cellText=df.values, colLabels=df.columns, cellLoc='left', loc='center',
               colColours=['#f5f5f5'] * 2)
plt.savefig('/content/drive/MyDrive/Projects/ml_implications_table.png', bbox_inches='tight')
plt.show()
6.3
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Concept map: four future research areas, stacked vertically with arrows.
fig, ax = plt.subplots(figsize=(14, 10))
colors = ['lightblue', 'lightgreen', 'lightcoral', 'lightgoldenrodyellow']
research_areas = [
    'Advanced ML Algorithms',
    'Incorporating Additional Features',
    'Fairness and Equity in ML',
    'Real-World Integration'
]
details = [
    'Explore deep learning techniques\n(e.g., CNNs, RNNs)\nAssess performance vs. traditional models',
    'Include additional variables\n(e.g., socioeconomic status, longitudinal data)\nEnhance prediction precision',
    'Develop fairness-aware algorithms\nImplement bias correction methods\nEnsure equity across demographics',
    'Evaluate ML model implementation in real-world settings\nAssess integration with health information systems\nIdentify practical challenges'
]
coordinates = [(0.2, 0.8), (0.2, 0.6), (0.2, 0.4), (0.2, 0.2)]
# One rounded box per research area: bold heading above, detail below.
for idx, (x, y) in enumerate(coordinates):
    box = patches.FancyBboxPatch((x - 0.15, y - 0.1), 0.3, 0.2, boxstyle="round,pad=0.1",
                                 edgecolor='black', facecolor=colors[idx], linewidth=2)
    ax.add_patch(box)
    ax.text(x, y + 0.05, research_areas[idx], ha='center', va='center',
            fontsize=12, fontweight='bold')
    ax.text(x, y - 0.05, details[idx], ha='center', va='center', fontsize=10, wrap=True)
# Arrows connect each area to the next one down.
for idx in range(len(coordinates) - 1):
    x1, y1 = coordinates[idx]
    x2, y2 = coordinates[idx + 1]
    ax.annotate('', xy=(x2, y2 + 0.1), xytext=(x1, y1 - 0.1),
                arrowprops=dict(arrowstyle='->', lw=2))
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.axis('off')
plt.title('Future Research Directions in ML for Healthcare Cost Prediction', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Projects/future_research_directions.png')
plt.show()
6.4
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Closing concept map: four summary topics linked top to bottom.
fig, ax = plt.subplots(figsize=(14, 10))
colors = ['lightblue', 'lightgreen', 'lightcoral', 'lightgoldenrodyellow']
topics = [
    'ML Models: Random Forest vs. Gradient Boosting',
    'Accuracy Improvement',
    'Ethical Considerations',
    'Future Research Directions'
]
details = [
    'Gradient Boosting: MAE = 2,490.64\nRMSE = 4,438.10\nOutperforms Random Forest',
    'Enhanced Predictive Accuracy\nBetter Resource Allocation\nFinancial Planning',
    'Transparency\nData Privacy\nBias Mitigation',
    'Advanced Algorithms\nBroader Feature Sets\nReal-World Implementation'
]
coordinates = [(0.2, 0.8), (0.2, 0.6), (0.2, 0.4), (0.2, 0.2)]
# Draw each topic as a rounded box with its heading above the detail text.
for coord, topic, detail, shade in zip(coordinates, topics, details, colors):
    x, y = coord
    box = patches.FancyBboxPatch((x - 0.15, y - 0.1), 0.3, 0.2, boxstyle="round,pad=0.1",
                                 edgecolor='black', facecolor=shade, linewidth=2)
    ax.add_patch(box)
    ax.text(x, y + 0.05, topic, ha='center', va='center', fontsize=12, fontweight='bold')
    ax.text(x, y - 0.05, detail, ha='center', va='center', fontsize=10, wrap=True)
# Arrows chain consecutive topics together.
for (x1, y1), (x2, y2) in zip(coordinates, coordinates[1:]):
    ax.annotate('', xy=(x2, y2 + 0.1), xytext=(x1, y1 - 0.1),
                arrowprops=dict(arrowstyle='->', lw=2))
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.axis('off')
plt.title('Final Thoughts on ML for Healthcare Cost Prediction', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Projects/final_thoughts_concept_map.png')
plt.show()
from google.colab import files
import nbformat
from nbconvert import HTMLExporter
# Step 1: Upload the .ipynb file
uploaded = files.upload()
# Assuming only one file is uploaded, get the file name
notebook_filename = next(iter(uploaded))
# Step 2: Read and parse the notebook. Notebooks are JSON and UTF-8 by
# spec, so decode explicitly rather than relying on the platform default.
with open(notebook_filename, encoding='utf-8') as f:
    notebook = nbformat.reads(f.read(), as_version=4)
# FIX: nbconvert's HTML template passes SVG outputs through lxml's
# `clean_html`, which raises
# "ValueError: Unicode strings with encoding declaration are not supported"
# when the SVG string still carries an `<?xml ... ?>` prolog (the failure
# seen on the previous run). Strip that prolog from every image/svg+xml
# output before exporting.
for cell in notebook.cells:
    for output in cell.get('outputs', []):
        svg = output.get('data', {}).get('image/svg+xml')
        if isinstance(svg, list):  # nbformat may store multiline data as a list
            svg = ''.join(svg)
        if isinstance(svg, str) and svg.lstrip().startswith('<?xml'):
            output['data']['image/svg+xml'] = svg.split('?>', 1)[1].lstrip()
# Convert to HTML
html_exporter = HTMLExporter()
(html_content, _) = html_exporter.from_notebook_node(notebook)
# Step 3: Save the HTML content to a file
html_filename = notebook_filename.replace('.ipynb', '.html')
with open(html_filename, 'w', encoding='utf-8') as f:
    f.write(html_content)
# Step 4: Download the HTML file
files.download(html_filename)
Saving Maheshinsurance (3).ipynb to Maheshinsurance (3) (1).ipynb
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-4-b42eeead7a05> in <cell line: 20>() 18 # Convert to HTML 19 html_exporter = HTMLExporter() ---> 20 (html_content, _) = html_exporter.from_notebook_node(notebook) 21 22 # Step 3: Save the HTML content to a file /usr/local/lib/python3.10/dist-packages/nbconvert/exporters/html.py in from_notebook_node(self, nb, resources, **kw) 221 self.register_filter("highlight_code", highlight_code) 222 self.register_filter("filter_data_type", filter_data_type) --> 223 return super().from_notebook_node(nb, resources, **kw) 224 225 def _init_resources(self, resources): /usr/local/lib/python3.10/dist-packages/nbconvert/exporters/templateexporter.py in from_notebook_node(self, nb, resources, **kw) 411 412 # Top level variables are passed to the template_exporter here. --> 413 output = self.template.render(nb=nb_copy, resources=resources) 414 output = output.lstrip("\r\n") 415 return output, resources /usr/local/lib/python3.10/dist-packages/jinja2/environment.py in render(self, *args, **kwargs) 1302 return self.environment.concat(self.root_render_func(ctx)) # type: ignore 1303 except Exception: -> 1304 self.environment.handle_exception() 1305 1306 async def render_async(self, *args: t.Any, **kwargs: t.Any) -> str: /usr/local/lib/python3.10/dist-packages/jinja2/environment.py in handle_exception(self, source) 937 from .debug import rewrite_traceback_stack 938 --> 939 raise rewrite_traceback_stack(source=source) 940 941 def join_path(self, template: str, parent: str) -> str: /usr/local/share/jupyter/nbconvert/templates/lab/index.html.j2 in top-level template code() 1 {%- extends 'base.html.j2' -%} 2 {% from 'mathjax.html.j2' import mathjax %} ----> 3 {% from 'jupyter_widgets.html.j2' import jupyter_widgets %} 4 5 {%- block header -%} /usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in top-level template code() 1 {%- extends 
'display_priority.j2' -%} ----> 2 {% from 'celltags.j2' import celltags %} 3 4 {% block codecell %} 5 {%- if not cell.outputs -%} /usr/local/share/jupyter/nbconvert/templates/base/display_priority.j2 in top-level template code() ----> 1 {%- extends 'base/null.j2' -%} 2 3 {#display data priority#} 4 5 /usr/local/share/jupyter/nbconvert/templates/base/null.j2 in top-level template code() 24 {%- block header -%} 25 {%- endblock header -%} ---> 26 {%- block body -%} 27 {%- block body_header -%} 28 {%- endblock body_header -%} /usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'body'() 27 {%- block body_header -%} 28 {%- endblock body_header -%} ---> 29 {%- block body_loop -%} 30 {%- for cell in nb.cells -%} 31 {%- block any_cell scoped -%} /usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'body_loop'() 29 {%- block body_loop -%} 30 {%- for cell in nb.cells -%} ---> 31 {%- block any_cell scoped -%} 32 {%- if cell.cell_type == 'code'-%} 33 {%- if resources.global_content_filter.include_code -%} /usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'any_cell'() 32 {%- if cell.cell_type == 'code'-%} 33 {%- if resources.global_content_filter.include_code -%} ---> 34 {%- block codecell scoped -%} 35 {%- if resources.global_content_filter.include_input and not cell.get("transient",{}).get("remove_source", false) -%} 36 {%- block input_group -%} /usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'codecell'() 10 {%- endif -%} 11 <div class="jp-Cell jp-CodeCell jp-Notebook-cell {{ no_output_class }} {{ no_input_class }} {{ celltags(cell) }}"> ---> 12 {{ super() }} 13 </div> 14 {%- endblock codecell %} /usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'codecell'() 42 {%- endif -%} 43 {%- if cell.outputs and resources.global_content_filter.include_output -%} ---> 44 {%- block output_group -%} 45 {%- if resources.global_content_filter.include_output_prompt -%} 46 {%- block output_prompt -%}{%- 
endblock output_prompt -%} /usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'output_group'() 36 <div class="jp-Collapser jp-OutputCollapser jp-Cell-outputCollapser"> 37 </div> ---> 38 {{ super() }} 39 </div> 40 {% endblock output_group %} /usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'output_group'() 46 {%- block output_prompt -%}{%- endblock output_prompt -%} 47 {%- endif -%} ---> 48 {%- block outputs scoped -%} 49 {%- for output in cell.outputs -%} 50 {%- block output scoped -%} /usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'outputs'() 42 {% block outputs %} 43 <div class="jp-OutputArea jp-Cell-outputArea"> ---> 44 {{ super() }} 45 </div> 46 {% endblock outputs %} /usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'outputs'() 48 {%- block outputs scoped -%} 49 {%- for output in cell.outputs -%} ---> 50 {%- block output scoped -%} 51 {%- if output.output_type == 'execute_result' -%} 52 {%- block execute_result scoped -%}{%- endblock execute_result -%} /usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'output'() 85 {{ self.output_area_prompt() }} 86 {% endif %} ---> 87 {{ super() }} 88 </div> 89 {% endblock output %} /usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'output'() 50 {%- block output scoped -%} 51 {%- if output.output_type == 'execute_result' -%} ---> 52 {%- block execute_result scoped -%}{%- endblock execute_result -%} 53 {%- elif output.output_type == 'stream' -%} 54 {%- block stream scoped -%} /usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'execute_result'() 123 {% block execute_result -%} 124 {%- set extra_class="jp-OutputArea-executeResult" -%} --> 125 {% block data_priority scoped %} 126 {{ super() }} 127 {% endblock data_priority %} /usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'data_priority'() 124 {%- set extra_class="jp-OutputArea-executeResult" -%} 125 {% block 
data_priority scoped %} --> 126 {{ super() }} 127 {% endblock data_priority %} 128 {%- set extra_class="" -%} /usr/local/share/jupyter/nbconvert/templates/base/display_priority.j2 in block 'data_priority'() 10 {%- endblock -%} 11 {%- elif type == 'image/svg+xml' -%} ---> 12 {%- block data_svg -%} 13 {%- endblock -%} 14 {%- elif type == 'image/png' -%} /usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'data_svg'() 160 <img src="{{ output.svg_filename | posix_path | escape_html }}"> 161 {%- else %} --> 162 {{ output.data['image/svg+xml'] | clean_html }} 163 {%- endif %} 164 </div> /usr/local/lib/python3.10/dist-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so in lxml.html.clean.Cleaner.clean_html() /usr/local/lib/python3.10/dist-packages/lxml/html/__init__.py in fromstring(html, base_url, parser, **kw) 871 else: 872 is_full_html = _looks_like_full_html_unicode(html) --> 873 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) 874 if is_full_html: 875 return doc /usr/local/lib/python3.10/dist-packages/lxml/html/__init__.py in document_fromstring(html, parser, ensure_head_body, **kw) 757 if parser is None: 758 parser = html_parser --> 759 value = etree.fromstring(html, parser, **kw) 760 if value is None: 761 raise etree.ParserError( src/lxml/etree.pyx in lxml.etree.fromstring() src/lxml/parser.pxi in lxml.etree._parseMemoryDocument() ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta
# Project kickoff; every task below is scheduled relative to this date.
start_date = datetime(2024, 7, 1)
# Eight back-to-back one-week tasks, in execution order.
task_names = [
    "Draft Chapter 1 & 2",
    "Design Methodology",
    "Data Collection & Preprocessing",
    "Develop ML Models",
    "Model Training & Evaluation",
    "Analyze Results & Draft Chapter 5",
    "Write Chapters 6 & 7",
    "Final Revisions & Submission",
]
# Map each task to its [start, end] window: week i runs weeks i..i+1.
tasks = {
    name: [start_date + timedelta(weeks=i), start_date + timedelta(weeks=i + 1)]
    for i, name in enumerate(task_names)
}
fig, ax = plt.subplots(figsize=(10, 6))
# One horizontal bar per task, positioned at its start date.
for task, (start, end) in tasks.items():
    ax.barh(task, (end - start).days, left=start, color='skyblue', edgecolor='black')
# Weekly ticks with ISO-style date labels on the x-axis.
ax.xaxis.set_major_locator(mdates.WeekdayLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.xticks(rotation=45)  # slanted labels avoid overlap
plt.xlabel('Date')
plt.ylabel('Tasks')
plt.title('8-Week Project Plan Gantt Chart')
plt.grid(True)
plt.tight_layout()
plt.show()